home *** CD-ROM | disk | FTP | other *** search
- /*
- * recovery.c --
- *
- * The routines here maintain up/down state about other hosts.
- * Other modules register as clients of the recovery module,
- * and can then ask to be called back when some other host crashes
- * or reboots. Modules always get called back when someone crashes,
- * and then they have the option of being called back when the
- * host reboots. Regular message traffic plus explicit pinging
- * are used to track the state of the other hosts. Pinging is
- * only done if some module is explicitly interested in a host.
- *
- * Recov_HostAlive and Recov_HostDead are used by RPC to tell us when
- * messages have arrived, or if transactions have timed out.
- * Recov_IsHostDown is used to query the state of another host,
- * Recov_RebootRegister is used to get a callback upon a reboot, and
- * Recov_WaitForHost is used to block a process until a host reboots.
- * (Recov_WaitForHost isn't used much. Instead, modules rely on the
- * recovery callbacks to indicate that a host is back to life, and
- * they block processes in their own way.)
- *
- * Note: A synchronization hook is provided by Recov_HostAlive; its
- * caller can be blocked if crash recovery actions are in progress.
- *
- * Copyright 1987 Regents of the University of California
- * All rights reserved.
- */
-
- #ifndef lint
- static char rcsid[] = "$Header: /cdrom/src/kernel/Cvsroot/kernel/recov/recovery.c,v 9.27 92/12/13 18:20:14 mgbaker Exp $ SPRITE (Berkeley)";
- #endif /* not lint */
-
-
- #include <sprite.h>
- #include <recov.h>
- #include <sync.h>
- #include <net.h>
- #include <rpc.h>
- #include <hash.h>
- #include <stdlib.h>
- #include <trace.h>
- #include <fsutil.h>
- #include <bstring.h>
- #include <stdio.h>
- #include <devClientDev.h>
-
- /*
- * Other kernel modules arrange call-backs when a host crashes or reboots.
- * Each registered call-back is kept as one NotifyElement on a list: the
- * single crashCallBackList for crashes, or a host's rebootList for
- * reboots. The calling sequence of the callbacks is as follows:
- * (*proc)(spriteID, clientData)
- * Use Recov_CrashRegister and Recov_RebootRegister to set up the call backs.
- */
-
- typedef struct {
- List_Links links; /* List linkage. Kept as the first field because
- * elements are cast to and from (List_Links *). */
- void (*proc)(); /* Procedure to call back. */
- int refCount; /* Number of outstanding registrations of this
- * (proc, data) pair. Maintained by
- * Recov_RebootRegister and
- * Recov_RebootUnRegister. */
- ClientData data; /* Client data recorded with the call-back. */
- } NotifyElement;
-
- /*
- * There is a single list of crash call-backs; it isn't per machine
- * like the reboot call-backs. The list header is initialized by
- * Recov_Init and elements are appended by Recov_CrashRegister.
- */
- static List_Links crashCallBackList;
-
- /*
- * recov_CrashDelay is the grace period given when another host
- * is apparently down. Reboots are still detected so that
- * the crash callbacks will get called to clean up.
- * Recov_Init sets it to timer_IntOneMinute.
- */
- unsigned int recov_CrashDelay;
-
- /*
- * Statistics about the recovery module. The counters bumped in this
- * part of the file are packets, timeouts, pings and pingsSuppressed.
- */
- Recov_Stats recov_Stats;
-
- /*
- * Per-client statistics about recovery on the server.
- * This is only used for reporting statistics; it isn't used for keeping
- * track of client state.
- * This amounts to a per-host list, in array form.
- * Each host has numTries elements in the array. The spriteID and numTries
- * fields are only initialized in the first element.
- * Note that the times here are of type Time, whereas the per-host state
- * kept in RecovHostState and RecovStampList records Timer_Ticks.
- */
- typedef struct RecovPerHostInfo {
- int spriteID; /* Sprite ID of client. */
- Time start; /* First recovery attempt. */
- Time finished; /* First recovery attempt finished. */
- int numTries; /* Number of recovery attempts. */
- int numHandles; /* Number of reopens requested. */
- int numSuccessful; /* Handles successfully recovered. */
- } RecovPerHostInfo;
-
-
-
- /*
- * The state of other hosts is kept in a hash table keyed on SpriteID.
- * This state is maintained by Recov_HostAlive and Recov_HostDead, which are
- * called after packet reception and RPC timeout, respectively.
- * Recov_HostDead is also called by the Rpc_Daemon if it can't get an
- * explicit acknowledgment from a client.
- * The value of each entry is a (RecovHostState *); the routines in this
- * file treat a value of NIL as "no state allocated for this host yet".
- * The table itself is set up by Recov_Init.
- */
- static Hash_Table recovHashTableStruct;
- static Hash_Table *recovHashTable = &recovHashTableStruct;
-
- typedef struct RecovStampList { /* One time stamp record; appended by
- * Recov_SetClientState for each recovery
- * attempt after a host's first one. */
- List_Links timeStampList; /* Linkage on RecovHostState.timeStampList.
- * First field: records are cast to
- * (List_Links *). */
- Timer_Ticks start; /* Set from Timer_GetCurrentTicks when the
- * record is created. */
- Timer_Ticks finished; /* Presumably when the attempt finished; not
- * set in the code that creates the record
- * -- TODO confirm where it is filled in. */
- int numHandles; /* Handles since last time. */
- int numSuccessful; /* Successful last time. */
- } RecovStampList;
-
- typedef struct RecovHostState { /* Everything known about one other host;
- * stored as the value of its hash entry. */
- int state; /* flags defined in .h file */
- int clientState; /* flags defined in recov.h. Read and changed
- * through Recov_GetClientState,
- * Recov_SetClientState and
- * Recov_ClearClientState. */
- int spriteID; /* Sprite Host ID */
- unsigned int bootID; /* Boot timestamp from RPC header. Zero until
- * the first message from the host is seen. */
- Time time; /* Time of last message */
- Sync_Condition alive; /* Notified when host comes up */
- Sync_Condition recovery; /* Notified when recovery is complete */
- List_Links rebootList; /* List of callbacks for when this
- * host reboots. */
- int numFailures; /* Times a failure occurs during the
- * reboot callbacks. Such a failure
- * triggers a retry of the reboot
- * callbacks. */
- /*
- * The following fields are used in the tracing of the recovery module.
- */
- Timer_Ticks start; /* Time that recovery is started. */
- Timer_Ticks finished; /* Time recovery attempt finishes. */
- int numTries; /* Number of times recov attempted. Incremented
- * by Recov_SetClientState each time it is
- * called with CLT_RECOV_IN_PROGRESS. */
- int numHandles; /* Handles requested. */
- int numSuccessful; /* Successful handles. */
- int currentHandles; /* Temporary info. */
- int currentSuccessful; /* Temporary info; cleared together with
- * currentHandles at the start of a repeated
- * recovery attempt. */
- List_Links timeStampList; /* List of time stamps for recovery. */
- int oldState; /* Used for screening out trace recs. */
- ClientData callToken; /* Token for timeout callback to
- * do recovery with server if server
- * didn't contact us. We must be
- * able to deschedule the callback, so
- * that's what this token is for. */
- Sync_Condition waitForServer; /* Waiting for server-driven recovery
- * to wake us. Or else for our
- * timeout to wake us.
- */
- } RecovHostState;
-
/*
 * RECOV_INIT_HOST --
 *
 *	Allocate a RecovHostState, zero it, initialize its two list
 *	headers, and fill in the host ID, initial state flags and boot ID.
 *	The new record is assigned to hostPtr, which must be an lvalue of
 *	type (RecovHostState *).  The caller is responsible for storing the
 *	pointer in the host's hash table entry.
 *
 *	The body is wrapped in do { } while (0) so that an invocation
 *	followed by a semicolon is exactly one statement, and every macro
 *	argument is parenthesized.
 */
#define RECOV_INIT_HOST(hostPtr, zspriteID, zstate, zbootID) \
    do { \
        (hostPtr) = (RecovHostState *) malloc(sizeof (RecovHostState)); \
        (void)bzero((Address)(hostPtr), sizeof(RecovHostState)); \
        List_Init(&(hostPtr)->rebootList); \
        List_Init(&(hostPtr)->timeStampList); \
        (hostPtr)->spriteID = (zspriteID); \
        (hostPtr)->state = (zstate); \
        (hostPtr)->bootID = (zbootID); \
        (hostPtr)->numFailures = 0; \
        (hostPtr)->oldState = 0; \
        (hostPtr)->callToken = (ClientData) NIL; \
    } while (0)
-
- /*
- * Access to the hash table is monitored. The lock is initialized by
- * Recov_Init. LOCKPTR is presumably what the LOCK_MONITOR and
- * UNLOCK_MONITOR macros used by the ENTRY routines expand to use
- * (their definitions are not in this file -- confirm against sync.h).
- */
- static Sync_Lock recovLock;
- #define LOCKPTR (&recovLock)
-
-
- /*
- * recov_PrintLevel defines how noisy we are about other hosts.
- * Values for the print level should be defined in increasing order:
- * a message is printed when recov_PrintLevel is greater than or equal
- * to the level the message was tagged with (see RecovHostPrint).
- */
- int recov_PrintLevel = RECOV_PRINT_REBOOT;
-
/*
 * RecovHostPrint --
 *
 *	Print a message about a host via Sys_HostPrint, but only if the
 *	current recov_PrintLevel is at least the given level.
 *
 *	Wrapped in do { } while (0) so that "RecovHostPrint(...);" is a
 *	single statement and is safe as the body of an unbraced if/else.
 */
#define RecovHostPrint(level, spriteID, message) \
    do { \
        if (recov_PrintLevel >= (level)) { \
            Sys_HostPrint((spriteID), (message)); \
        } \
    } while (0)
-
- Trace_Header recovTraceHdr; /* Trace buffer header for this module. */
- Trace_Header *recovTraceHdrPtr = &recovTraceHdr; /* Passed to Trace_Init by Recov_Init. */
- int recovTraceLength = 200; /* Passed to Trace_Init by Recov_Init along with sizeof(RecovTraceRecord). */
- Boolean recovTracing = TRUE; /* Presumably gates RECOV_TRACE; the macro is defined elsewhere -- TODO confirm. */
-
- /*
- * TRUE if we're using transparent server recovery (using recovery box).
- * The default is chosen at compile time by RECOV_TRANSPARENT.
- */
- #ifdef RECOV_TRANSPARENT
- Boolean recov_Transparent = TRUE;
- #else
- Boolean recov_Transparent = FALSE;
- #endif /* RECOV_TRANSPARENT */
-
- /*
- * TRUE if we should batch together multiple reopens in an rpc.
- */
- Boolean recov_BulkHandles = TRUE;
-
- /*
- * TRUE if the clients should ignore the fact that a server is able to
- * do transparent recovery. This will be FALSE except for some testing.
- */
- Boolean recov_ClientIgnoreTransparent = FALSE;
- /*
- * This one should usually be true: okay to do fast reboot as far as
- * reusing text and initialized heap, but this doesn't have to include
- * using the recovery box.
- * Defining RECOV_NOCOPY at compile time makes the default FALSE.
- */
- #ifdef RECOV_NOCOPY
- Boolean recov_DoInitDataCopy = FALSE;
- #else
- Boolean recov_DoInitDataCopy = TRUE;
- #endif /* RECOV_NOCOPY */
-
- /*
- * TRUE if we're recovering using server-driven method.
- * This is set to true by an initialization routine that is called
- * as a result of the user-level daemon contacting the kernel.
- */
- Boolean recov_ServerDriven = FALSE;
-
- /*
- * TRUE if the clients should ignore server-driven recovery.
- */
- Boolean recov_ClientIgnoreServerDriven = TRUE;
- /*
- * Don't bother to reopen files that only have clean blocks in the cache.
- * Invalidate their clean cache blocks and scavenge the handles.
- * This variable and recov_SkipCleanFiles should never both be set true!
- */
- Boolean recov_IgnoreCleanFiles = FALSE;
-
- /*
- * Don't bother to reopen files that only have clean blocks in the cache.
- * Just leave their cache blocks and handles alone.
- * This variable and recov_IgnoreCleanFiles should never both be set true!
- */
- Boolean recov_SkipCleanFiles = TRUE;
-
- /*
- * Are we blocking out some RPCs because server-driven recovery is
- * in progress? Consulted by Recov_SetClientState when it reports a
- * late recovery from an old client.
- */
- Boolean recov_BlockingRpcs = FALSE;
-
- /*
- * Forward declarations of routines that are private to this file.
- * CrashCallBacks is the routine handed to Proc_CallFunc by
- * Recov_HostAlive and Recov_HostDead, with the spriteID as its data.
- */
-
- static void CrashCallBacks _ARGS_((ClientData data, Proc_CallInfo *callInfoPtr));
- static void CallBacksDone _ARGS_((int spriteID));
- static void MarkRecoveryComplete _ARGS_((int spriteID));
- static void GetRebootList _ARGS_((List_Links *notifyListHdr, int spriteID));
- static char *GetState _ARGS_((int state));
- static void PrintExtraState _ARGS_((RecovHostState *hostPtr));
-
-
-
- /*
- *----------------------------------------------------------------------
- *
- * Recov_Init --
- *
- * Set up the data structures used by the recovery module.
- *
- * Results:
- * None.
- *
- * Side effects:
- * None.
- *
- *----------------------------------------------------------------------
- */
-
- void
- Recov_Init()
- {
- Sync_LockInitDynamic(&recovLock, "Recov:recovLock");
- Hash_Init(recovHashTable, 8, HASH_ONE_WORD_KEYS);
- List_Init(&crashCallBackList);
- Trace_Init(recovTraceHdrPtr, recovTraceLength,
- sizeof(RecovTraceRecord), 0);
- Fsutil_InitBulkReopenTables();
- recov_CrashDelay = (unsigned int)(timer_IntOneMinute);
- RecovPingInit();
- return;
- }
-
- /*
- *----------------------------------------------------------------------
- *
- * Recov_CrashRegister --
- *
- * This procedure is used to register a crash callback procedure.
- * This is typically done once at boot time by each module that
- * is interested in learning about the failure of other hosts.
- * When other hosts are (apparently) down the recovery module
- * calls back to other modules that have registered via this procedure.
- * This allows those other modules to clean up any state associated
- * with the crashed host.
- *
- * Results:
- * None.
- *
- * Side effects:
- * Callback entry added to the crash call-back list.
- *
- *----------------------------------------------------------------------
- */
- void
- Recov_CrashRegister(crashCallBackProc, crashData)
- void (*crashCallBackProc)();
- ClientData crashData;
- {
- register NotifyElement *notifyPtr;
-
- notifyPtr = (NotifyElement *) malloc(sizeof (NotifyElement));
- notifyPtr->proc = crashCallBackProc;
- notifyPtr->data = crashData;
- List_InitElement((List_Links *) notifyPtr);
- List_Insert((List_Links *) notifyPtr, LIST_ATREAR(&crashCallBackList));
- return;
- }
-
- /*
- *----------------------------------------------------------------------
- *
- * Recov_RebootRegister --
- *
- * Schedule a callback for when a particular host reboots.
- * To make sure we detect a crash, the recovery module has to
- * periodically check on the state of the target host.
- *
- * Results:
- * None.
- *
- * Side effects:
- * This initiate a background callback to check-up on the host's state.
- *
- *----------------------------------------------------------------------
- */
-
- ENTRY void
- Recov_RebootRegister(spriteID, rebootCallBackProc, rebootData)
- int spriteID;
- void (*rebootCallBackProc)();
- ClientData rebootData;
- {
- Hash_Entry *hashPtr;
- RecovHostState *hostPtr;
- register NotifyElement *notifyPtr;
- Boolean found = FALSE;
-
- LOCK_MONITOR;
-
- if (spriteID <= 0 || spriteID == rpc_SpriteID) {
- panic("Recov_RebootRegister, bad hostID %d\n", spriteID);
- } else {
- hashPtr = Hash_Find(recovHashTable, (Address)spriteID);
- if (hashPtr->value == (Address)NIL) {
- RECOV_INIT_HOST(hostPtr, spriteID, RECOV_STATE_UNKNOWN, 0);
- hashPtr->value = (Address)hostPtr;
- RECOV_TRACE(spriteID, RECOV_STATE_UNKNOWN, RECOV_CUZ_INIT);
- } else {
- hostPtr = (RecovHostState *)hashPtr->value;
- }
- /*
- * Save the callback while avoiding duplications.
- */
- LIST_FORALL(&hostPtr->rebootList, (List_Links *)notifyPtr) {
- if (notifyPtr->proc == rebootCallBackProc &&
- notifyPtr->data == rebootData) {
- found = TRUE;
- break;
- }
- }
- if (!found) {
- notifyPtr = (NotifyElement *) malloc(sizeof (NotifyElement));
- notifyPtr->proc = rebootCallBackProc;
- notifyPtr->data = rebootData;
- notifyPtr->refCount = 1;
- List_InitElement((List_Links *)notifyPtr);
- List_Insert((List_Links *)notifyPtr,
- LIST_ATFRONT(&hostPtr->rebootList));
- } else {
- notifyPtr->refCount++;
- }
- /*
- * Mark the host as being interesting, and add it to the ping
- * list if necessary.
- */
- hostPtr->state |= RECOV_PINGING_HOST;
- RecovAddHostToPing(spriteID);
- }
- UNLOCK_MONITOR;
- return;
- }
-
- /*
- *----------------------------------------------------------------------
- *
- * Recov_RebootUnRegister --
- *
- * Remove a callback for when a particular host reboots. This is
- * used after we are no longer interested in a host rebooting.
- *
- * Results:
- * None.
- *
- * Side effects:
- * Nukes the reboot procedure. If all interested parties remove their
- * reboot callbacks then the periodic check of the other host is
- * stopped.
- *
- *----------------------------------------------------------------------
- */
-
- ENTRY void
- Recov_RebootUnRegister(spriteID, rebootCallBackProc, rebootData)
- int spriteID;
- void (*rebootCallBackProc)();
- ClientData rebootData;
- {
- Hash_Entry *hashPtr;
- RecovHostState *hostPtr;
- register NotifyElement *notifyPtr;
- Boolean found = FALSE;
-
- LOCK_MONITOR;
-
- if (spriteID <= 0 || spriteID == rpc_SpriteID) {
- panic("Recov_RebootUnRegister, bad hostID %d\n", spriteID);
- } else {
- hashPtr = Hash_Find(recovHashTable, (Address)spriteID);
- if (hashPtr->value == (Address)NIL) {
- RECOV_INIT_HOST(hostPtr, spriteID, RECOV_STATE_UNKNOWN, 0);
- hashPtr->value = (Address)hostPtr;
- RECOV_TRACE(spriteID, RECOV_STATE_UNKNOWN, RECOV_CUZ_INIT);
- } else {
- hostPtr = (RecovHostState *)hashPtr->value;
- }
- /*
- * Look for the matching callback.
- */
- LIST_FORALL(&hostPtr->rebootList, (List_Links *)notifyPtr) {
- if (notifyPtr->proc == rebootCallBackProc &&
- notifyPtr->data == rebootData) {
- found = TRUE;
- break;
- }
- }
- if (found) {
- notifyPtr->refCount--;
- if (notifyPtr->refCount <= 0) {
- int num;
- /*
- * Mousetrap for debugging recovery reference count problem.
- */
- if (notifyPtr->proc == (void((*)())) Fsutil_Reopen) {
-
- if (recov_PrintLevel >= RECOV_PRINT_CRASH) {
- printf(
- "Recov: deleting Fsutil_Reopen for server %d ref count %d\n",
- spriteID, notifyPtr->refCount);
- }
- /*
- * We want to panic if we still have handles for
- * this server.
- */
- num = Fsutil_TestForHandles(spriteID);
- /*
- * This routine is called before the handle is removed,
- * so we must take into account the fact that it still
- * exists in the handle table.
- */
- if (num > 1) {
- printf("%d file and device handles remain\n", num);
- panic("Shouldn't have deleted it - handles remain!\n");
- }
- }
- List_Remove((List_Links *)notifyPtr);
- free((Address)notifyPtr);
- }
- }
- }
- UNLOCK_MONITOR;
- return;
- }
-
- /*
- *----------------------------------------------------------------------
- *
- * Recov_HostAlive --
- *
- * Mark the host as being alive. This is called when we've received
- * a message from the host. It uses state from the host table and
- * the bootID parameter to detect reboots. If a reboot is detected,
- * but we thought the host was up, then the Crash call-backs are invoked.
- * In any case, a reboot invokes the Reboot call-backs, if any.
- *
- * This procedure is called from client RPC upon successful completion
- * of an RPC, and by server RPC upon receipt of a client request.
- * These two cases are identified by the 'asyncRecovery' parameter.
- * Servers want synchronous recovery so they don't service anything
- * until state associated with that client has been cleaned up via
- * the Crash call-backs. So Recov_HostAlive blocks (if !asyncRecovery)
- * until the crash call-backs are complete. Clients don't have the
- * same worries so they let the crash call-backs complete in the
- * background (asyncRecovery is TRUE).
- *
- * Results:
- * None.
- *
- * Side effects:
- * Updates the boot timestamp of the other host. Procedures installed
- * with Recov_CrashRegister are called when the bootID changes. A
- * timestamp of when this message was received is obtained from the
- * "cheap" clock so we can tell later if there has been recent message
- * traffic.
- *
- *----------------------------------------------------------------------
- */
-
- ENTRY void
- Recov_HostAlive(spriteID, bootID, asyncRecovery, rpcNotActive, recovType)
- int spriteID; /* Host ID of the message sender */
- unsigned int bootID; /* Boot time stamp from message header */
- Boolean asyncRecovery; /* TRUE means do recovery call-backs in
- * the background. FALSE causes the process
- * to wait until crash recovery is complete. */
- Boolean rpcNotActive; /* This is a flag propagated from the rpc
- * packet header. If set it means the RPC
- * system on the remote host isn't fully
- * turned on. Reboot recovery is delayed
- * until this changes. */
- unsigned int recovType; /* Whether the host that's alive went through
- * a fast boot or is doing server-driven
- * recovery or not. Tested against the
- * RECOV_FAST_BOOT and RECOV_SERVER_DRIVEN
- * bits below. */
- {
- register Hash_Entry *hashPtr;
- register RecovHostState *hostPtr;
- int hostState = -1; /* Set to DEV_CLIENT_STATE_NEW_HOST when the
- * device module should be told, at exit,
- * that the host is up. */
-
- LOCK_MONITOR;
- if (spriteID == NET_BROADCAST_HOSTID || bootID == 0 || sys_ShuttingDown) {
- /*
- * Don't track the broadcast address. Also ignore zero valued
- * bootIDs. These come from hosts at early boot time, or
- * in certain error conditions like trying to send too much
- * data in a single RPC. Also don't bother to check things
- * where we are shutting down the system because we don't want
- * RPCs for the cache data to get blocked.
- */
- UNLOCK_MONITOR;
- return;
- }
-
- recov_Stats.packets++;
- hashPtr = Hash_Find(recovHashTable, (Address)spriteID);
- if (hashPtr->value == (Address)NIL) {
- /*
- * Initialize the host's state. This is the first time we've talked
- * to it since we've been up, so take no action. Because the new
- * state records this bootID and RECOV_HOST_ALIVE, the code below
- * takes the fast path straight to the exit label.
- */
- RECOV_INIT_HOST(hostPtr, spriteID, RECOV_HOST_ALIVE, bootID);
- hashPtr->value = (Address)hostPtr;
-
- RecovHostPrint(RECOV_PRINT_IF_UP, spriteID, "is up\n");
- RECOV_TRACE(spriteID, RECOV_HOST_ALIVE, RECOV_CUZ_INIT);
- hostState = DEV_CLIENT_STATE_NEW_HOST;
- } else {
- hostPtr = (RecovHostState *)hashPtr->value;
- }
- /*
- * Have to read the clock in order to suppress repeated pings,
- * see Recov_GetHostState and Recov_IsHostDown.
- */
- Timer_GetTimeOfDay(&hostPtr->time, (int *)NIL, (Boolean *)NIL);
- /*
- * Check for a rebooted peer by comparing boot time stamps.
- */
- if (hostPtr->bootID != bootID) {
- if (hostPtr->bootID != 0) {
- RecovHostPrint(RECOV_PRINT_REBOOT, spriteID, "rebooted\n");
- } else {
- /*
- * We initialized state before talking to the host the first time.
- * The state is 'unknown' so we won't do crash call-backs.
- */
- }
- hostPtr->bootID = bootID;
- RECOV_TRACE(spriteID, hostPtr->state, RECOV_CUZ_REBOOT);
- if (hostPtr->state &
- (RECOV_HOST_ALIVE|RECOV_HOST_BOOTING)) {
- RecovHostPrint(RECOV_PRINT_ALL, spriteID,
- "Undetected crash occurred.\n");
- /*
- * A crash occurred un-detected. We do the crash call-backs
- * first, and block server processes in the meantime.
- * RECOV_CRASH_CALLBACKS flag is cleared by CrashCallBacks.
- * The flag is tested first so that the call-backs are not
- * scheduled again while a previous round is still pending.
- */
- hostPtr->state &= ~(RECOV_HOST_ALIVE|RECOV_HOST_DEAD);
- hostPtr->state |= RECOV_HOST_BOOTING;
- RECOV_TRACE(spriteID, hostPtr->state, RECOV_CUZ_CRASH_UNDETECTED);
- if ((hostPtr->state & RECOV_CRASH_CALLBACKS) == 0) {
- hostPtr->state |= RECOV_CRASH_CALLBACKS;
- RECOV_TRACE(spriteID, hostPtr->state, RECOV_CUZ_CRASH_UNDETECTED);
- Proc_CallFunc(CrashCallBacks, (ClientData)spriteID, 0);
- }
- }
- } else if ( ! (hostPtr->state &
- (RECOV_CRASH_CALLBACKS|RECOV_WANT_RECOVERY)) &&
- (hostPtr->state & RECOV_HOST_ALIVE)) {
- /*
- * Fast path. We already think the other host is up, it didn't
- * reboot, we don't want recovery, and there are no pending
- * crash call-backs to synchronize with.
- */
- goto exit;
- }
- /*
- * Block servers until crash recovery actions complete.
- * This prevents servicing requests from clients until after the
- * recovery actions complete. The wait is on the host's 'recovery'
- * condition and is repeated until RECOV_CRASH_CALLBACKS is clear.
- */
- if (! asyncRecovery) {
- RecovHostPrint(RECOV_PRINT_ALL, spriteID, "Async recovery false.\n");
- while (hostPtr->state & RECOV_CRASH_CALLBACKS) {
- (void)Sync_Wait(&hostPtr->recovery, FALSE);
- if (sys_ShuttingDown) { /* Don't keep waiting during a shutdown. */
- UNLOCK_MONITOR;
- Proc_Exit(1);
- }
- }
- }
- /*
- * Now that we've taken care of crash recovery, we see if the host
- * is newly up. If so, invoke any reboot call-backs and notify
- * waiting processes. This means clientA (us) may start
- * re-opening files from serverB (the other guy) at the same time
- * as clientA (us) is closing files that serverB had had open.
- * ie. both the crash and reboot call backs may proceed in parallel.
- * The switch looks only at the three up/down bits of the state.
- */
- switch(hostPtr->state &
- (RECOV_HOST_ALIVE|RECOV_HOST_BOOTING|RECOV_HOST_DEAD)) {
- case RECOV_STATE_UNKNOWN: /* This is zero, no bits set */
- /*
- * We have uninitialized state for the host, mark it alive.
- */
- RecovHostPrint(RECOV_PRINT_IF_UP, spriteID, "is up\n");
- if (rpcNotActive) {
- hostPtr->state |= RECOV_HOST_BOOTING;
- } else {
- hostPtr->state |= RECOV_HOST_ALIVE;
- hostState = DEV_CLIENT_STATE_NEW_HOST;
- }
- break;
- case RECOV_HOST_ALIVE:
- /*
- * Host already alive. We may still want recovery at this
- * point. See CallBacksDone.
- */
- RecovHostPrint(RECOV_PRINT_ALL, spriteID, "Already up.\n");
- break;
- case RECOV_HOST_BOOTING:
- /*
- * See if a booting host is ready yet.
- */
- RecovHostPrint(RECOV_PRINT_ALL, spriteID, "Booting, set recov.\n");
- if (! rpcNotActive) {
- hostPtr->state &= ~RECOV_HOST_BOOTING;
- hostPtr->state |= RECOV_HOST_ALIVE|RECOV_WANT_RECOVERY;
- hostState = DEV_CLIENT_STATE_NEW_HOST;
- RECOV_TRACE(spriteID, hostPtr->state, RECOV_CUZ_WAS_BOOTING);
- RecovHostPrint(RECOV_PRINT_ALL, spriteID,
- "Booting, set alive, recov.\n");
- }
- break;
- case RECOV_HOST_DEAD:
- /*
- * See if the host is newly booting or back from a net partition.
- * Either way the DEAD bit is cleared on the way out of this case.
- */
- if (rpcNotActive) {
- hostPtr->state |= RECOV_HOST_BOOTING;
- RECOV_TRACE(spriteID, hostPtr->state, RECOV_CUZ_NOW_BOOTING);
- RecovHostPrint(RECOV_PRINT_ALL, spriteID,
- "Dead or dying, set booting.\n");
- } else {
- hostPtr->state |= (RECOV_HOST_ALIVE|RECOV_WANT_RECOVERY);
- hostState = DEV_CLIENT_STATE_NEW_HOST;
- RECOV_TRACE(spriteID, hostPtr->state, RECOV_CUZ_WAS_DEAD);
- RecovHostPrint(RECOV_PRINT_ALL, spriteID,
- "Dead, dying, set want recov.\n");
- }
- hostPtr->state &= ~(RECOV_HOST_DEAD);
- break;
- default: /* More than one of the up/down bits is set. */
- printf("Unexpected recovery state <%x> for ", hostPtr->state);
- Sys_HostPrint(spriteID, "\n");
- break;
- }
- if (recovType & RECOV_FAST_BOOT) { /* Copy the sender's fast-boot bit into our state, reporting any change. */
- if (!(hostPtr->state & RECOV_FAST_BOOT)) {
- printf("Recov_HostAlive: setting state for host %d to FAST_BOOT\n",
- spriteID);
- }
- hostPtr->state |= RECOV_FAST_BOOT;
- } else {
- if ((hostPtr->state & RECOV_FAST_BOOT)) {
- printf("Recov_HostAlive: removing FAST_BOOT state for host %d\n",
- spriteID);
- }
- hostPtr->state &= ~RECOV_FAST_BOOT;
- }
- /* Test whether it's trying to do more than one kind of recovery? XXX */
- if (recovType & RECOV_SERVER_DRIVEN) { /* Likewise for the server-driven bit. */
- if (!(hostPtr->state & RECOV_SERVER_DRIVEN)) {
- printf(
- "Recov_HostAlive: setting state for host %d to SERVER_DRIVEN\n",
- spriteID);
- }
- hostPtr->state |= RECOV_SERVER_DRIVEN;
- } else {
- if ((hostPtr->state & RECOV_SERVER_DRIVEN)) {
- printf(
- "Recov_HostAlive: removing SERVER_DRIVEN state for host %d\n",
- spriteID);
- }
- hostPtr->state &= ~RECOV_SERVER_DRIVEN;
- }
- /*
- * After a host comes up enough to support RPC service, we
- * initiate reboot recovery if needed. RECOV_REBOOT_CALLBACKS is
- * set here so that the call-backs are scheduled only once while a
- * round is outstanding.
- */
- if ((hostPtr->state & RECOV_WANT_RECOVERY) &&
- (hostPtr->state & RECOV_HOST_ALIVE) &&
- ((hostPtr->state & RECOV_REBOOT_CALLBACKS) == 0)) {
- hostPtr->state &= ~RECOV_WANT_RECOVERY;
- hostPtr->state |= RECOV_REBOOT_CALLBACKS;
- RECOV_TRACE(spriteID, hostPtr->state, RECOV_CUZ_SCHED_CALLBACK);
- RecovHostPrint(RECOV_PRINT_ALL, spriteID,
- "Want recov, etc, callbacks.\n");
- Proc_CallFunc(RecovRebootCallBacks, (ClientData)spriteID, 0);
- }
- exit:
- if (hostState == DEV_CLIENT_STATE_NEW_HOST) {
- /*
- * Only call into device module if this isn't a client we're
- * doing server-driven recovery with at this very moment.
- */
- if ((hostPtr->clientState & CLT_DOING_SRV_RECOV) == 0) {
- Dev_ClientHostUp(spriteID);
- }
- }
- UNLOCK_MONITOR;
- return;
- }
-
- /*
- *----------------------------------------------------------------------
- *
- * Recov_HostDead --
- *
- * Change the host's state to "dead". This is called from client RPC
- * when an RPC timed out with no response. It is also called by the
- * Rpc_Daemon when it can't recontact a client to get an explicit
- * acknowledgment.
- *
- * Results:
- * None.
- *
- * Side effects:
- * If the host was previously thought up, this sets the state in
- * the host state table to dead and invokes the crash callbacks.
- *
- *----------------------------------------------------------------------
- */
-
- ENTRY void
- Recov_HostDead(spriteID)
- int spriteID;
- {
- register Hash_Entry *hashPtr;
- register RecovHostState *hostPtr;
- int hostState = -1;
-
- LOCK_MONITOR;
- if (spriteID == NET_BROADCAST_HOSTID || rpc_NoTimeouts) {
- /*
- * If rpcNoTimeouts is set the Rpc_Daemon may still call us if
- * it can't get an acknowledgment from a host to close down
- * a connection. We ignore this so that we don't take action
- * against the offending host (who is probably in the debugger)
- * (Hmm, it doesn't look like Rpc_Daemon calls this procedure.)
- */
- UNLOCK_MONITOR;
- return;
- }
-
- recov_Stats.timeouts++;
- hashPtr = Hash_Find(recovHashTable, (Address)spriteID);
- if (hashPtr->value == (Address)NIL) {
- RECOV_INIT_HOST(hostPtr, spriteID, RECOV_HOST_DEAD, 0);
- hashPtr->value = (Address)hostPtr;
- RECOV_TRACE(spriteID, RECOV_HOST_DEAD, RECOV_CUZ_INIT);
- hostState = DEV_CLIENT_STATE_DEAD_HOST;
- } else {
- hostPtr = (RecovHostState *)hashPtr->value;
- }
- switch(hostPtr->state &
- (RECOV_HOST_ALIVE|RECOV_HOST_BOOTING|RECOV_HOST_DEAD)) {
- case RECOV_HOST_DEAD:
- /*
- * Host already dead.
- */
- break;
- case RECOV_STATE_UNKNOWN:
- case RECOV_HOST_BOOTING:
- case RECOV_HOST_ALIVE:
- hostState = DEV_CLIENT_STATE_DEAD_HOST;
- hostPtr->state &=
- ~(RECOV_HOST_ALIVE|RECOV_HOST_BOOTING);
- /*
- * Special handling if we abort during the recovery protocol.
- * In this case it is possible for the other host to go from
- * alive to dead and back to alive before the recovery protocol
- * finally terminates. If that happens we could loose a reboot
- * event and fail to initiate recovery again. We mark the
- * host specially so the reboot callbacks are retried.
- */
- if (hostPtr->state & RECOV_REBOOT_CALLBACKS) {
- hostPtr->state |= RECOV_FAILURE;
- }
- hostPtr->state |= RECOV_HOST_DEAD|RECOV_CRASH_CALLBACKS;
- RecovHostPrint(RECOV_PRINT_CRASH, spriteID,
- "crash call-backs made\n");
- RECOV_TRACE(spriteID, hostPtr->state, RECOV_CUZ_CRASH);
- Proc_CallFunc(CrashCallBacks, (ClientData)spriteID, 0);
- break;
- }
- if (hostState == DEV_CLIENT_STATE_DEAD_HOST) {
- /*
- * Only call into device module if this isn't a client we're
- * doing server-driven recovery with at this very moment.
- */
- if ((hostPtr->clientState & CLT_DOING_SRV_RECOV) == 0) {
- Dev_ClientHostDown(spriteID);
- }
- }
- UNLOCK_MONITOR;
- return;
- }
-
- /*
- *----------------------------------------------------------------------
- *
- * Recov_IsHostDown --
- *
- * This decides if the specified host is down. If the host is known
- * to be down this routine returns FAILURE. SUCCESS is returned if
- * the host is alive, and RPC_SERVICE_DISABLED is returned if the
- * host is in its boot sequence and can't service RPC's yet. If there
- * hasn't been recent (within the last 10 seconds) message traffic
- * this this pings the host to find out for sure its state.
- *
- * Results:
- * SUCCESS if the host is up, FAILURE if it doesn't respond to
- * pings or is known to be down, and RPC_SERVICE_DISABLED if
- * the host says so.
- *
- * Side effects:
- * May do a ping.
- *
- *----------------------------------------------------------------------
- */
-
- ReturnStatus
- Recov_IsHostDown(spriteID)
- int spriteID;
- {
- register ReturnStatus status = SUCCESS;
-
- if (spriteID == NET_BROADCAST_HOSTID) {
- printf("Warning: Recov_IsHostDown, got broadcast address\n");
- return(SUCCESS);
- }
- switch (Recov_GetHostState(spriteID)) {
- case RECOV_STATE_UNKNOWN:
- RECOV_TRACE(spriteID, RECOV_STATE_UNKNOWN, RECOV_CUZ_PING_ASK);
- recov_Stats.pings++;
- status = Rpc_Ping(spriteID);
- break;
- case RECOV_HOST_BOOTING:
- case RECOV_HOST_ALIVE: /* fake it to allow for the grace period */
- recov_Stats.pingsSuppressed++;
- status = SUCCESS;
- break;
- case RECOV_HOST_DEAD:
- status = FAILURE;
- break;
- }
- return(status);
- }
-
- /*
- *----------------------------------------------------------------------
- *
- * Recov_HostTrace --
- *
- * Add an entry to the recovery trace.
- *
- * Results:
- * None.
- *
- * Side effects:
- * None.
- *
- *----------------------------------------------------------------------
- */
-
- ENTRY void
- Recov_HostTrace(spriteID, event)
- int spriteID;
- int event;
- {
- /*
- * No monitor lock needed here, since Trace_Insert does its own
- * synchronization.
- */
- RECOV_TRACE(spriteID, RECOV_STATE_UNKNOWN, event);
- }
-
- /*
- *----------------------------------------------------------------------
- *
- * Recov_GetClientState --
- *
- * Return the client state associated with a host. The recovery host
- * table is a convenient object keyed on spriteID. Other modules can
- * set their own state in the table (beyond the simple up/down state
- * mainted by the rest of this module), and retrieve it with this call.
- *
- * Results:
- * A copy of the clientState field. 0 is returned if there is no
- * host table entry.
- *
- * Side effects:
- * None.
- *
- *----------------------------------------------------------------------
- */
-
- ENTRY int
- Recov_GetClientState(spriteID)
- int spriteID;
- {
- Hash_Entry *hashPtr;
- RecovHostState *hostPtr;
- int stateBits = 0;
-
- LOCK_MONITOR;
-
- hashPtr = Hash_LookOnly(recovHashTable, (Address)spriteID);
- if (hashPtr != (Hash_Entry *)NIL) {
- hostPtr = (RecovHostState *)hashPtr->value;
- if (hostPtr != (RecovHostState *)NIL) {
- stateBits = hostPtr->clientState;
- }
- }
- UNLOCK_MONITOR;
- return(stateBits);
- }
-
- /*
- *----------------------------------------------------------------------
- *
- * Recov_SetClientState --
- *
- * Set a client state bit. This or's the parameter into the
- * client state word. The previous value of the client state
- * word is returned so this procedure can be used like test-and-set.
- *
- * Results:
- * None.
- *
- * Side effects:
- * Sets bits in the clientState field of the host state. This will add
- * an entry to the host table if one doesn't alreay exist. Its RPC
- * up/down state is set to "unknown" in this case.
- *
- *----------------------------------------------------------------------
- */
-
- ENTRY int
- Recov_SetClientState(spriteID, stateBits)
- int spriteID;
- int stateBits;
- {
- Hash_Entry *hashPtr;
- RecovHostState *hostPtr;
- register oldState;
- RecovStampList *stampPtr;
-
-
- LOCK_MONITOR;
-
- hashPtr = Hash_Find(recovHashTable, (Address)spriteID);
- hostPtr = (RecovHostState *)hashPtr->value;
- if (hostPtr == (RecovHostState *)NIL) {
- RECOV_INIT_HOST(hostPtr, spriteID, RECOV_STATE_UNKNOWN, 0);
- hashPtr->value = (Address)hostPtr;
- RECOV_TRACE(spriteID, RECOV_STATE_UNKNOWN, RECOV_CUZ_INIT);
- }
- if ((stateBits & CLT_RECOV_IN_PROGRESS) != 0) {
- /*
- * This is a test to see if we get potential inconsistency from lacking
- * backwards compatibility with old kernels on clients and new server-
- * driven recovery. It can go away when everybody is running a kernel
- * with the new recovery stuff in it. - Mary 11/10/92.
- */
- if ((hostPtr->clientState & CLT_OLD_RECOV) && !recov_BlockingRpcs) {
- printf("Recov_SetClient: got late recovery from old client %d.\n",
- spriteID);
- }
-
- if (hostPtr->numTries == 0) {
- /* First recovery attempt */
- if ((hostPtr->clientState & CLT_RECOV_IN_PROGRESS) != 0) {
- printf("No recovery attempt yet, but marked as in progress.");
- }
- Timer_GetCurrentTicks(&hostPtr->start);
- } else {
- /* Add a time-stamp to the recovery list. */
- stampPtr = (RecovStampList *) malloc(sizeof (RecovStampList));
- Timer_GetCurrentTicks(&stampPtr->start);
- List_InitElement((List_Links *) stampPtr);
- List_Insert((List_Links *) stampPtr,
- LIST_ATREAR(&hostPtr->timeStampList));
- /*
- * Clear handle count for this round.
- */
- hostPtr->currentHandles = 0;
- hostPtr->currentSuccessful = 0;
- }
- hostPtr->numTries++;
- }
-
- oldState = hostPtr->clientState;
- hostPtr->clientState |= stateBits;
- UNLOCK_MONITOR;
- return(oldState);
- }
-
- /*
- *----------------------------------------------------------------------
- *
- * Recov_ClearClientState --
- *
- * Clear client state bits.
- *
- * Results:
- * None.
- *
- * Side effects:
- * Clears bits in the clientState field of the host state. This does
- * nothing if the state doesn't exist.
- *
- *----------------------------------------------------------------------
- */
-
- ENTRY void
- Recov_ClearClientState(spriteID, stateBits)
- int spriteID;
- int stateBits;
- {
- register Hash_Entry *hashPtr;
- register RecovHostState *hostPtr = (RecovHostState *) NIL;
- RecovStampList *stampPtr;
-
- LOCK_MONITOR;
-
- hashPtr = Hash_LookOnly(recovHashTable, (Address)spriteID);
- if (hashPtr != (Hash_Entry *)NIL) {
- hostPtr = (RecovHostState *)hashPtr->value;
- if (hostPtr != (RecovHostState *)NIL) {
- hostPtr->clientState &= ~stateBits;
- }
- }
- /* End of recovery? */
- if ((hostPtr != (RecovHostState *) NIL) &&
- (stateBits & CLT_RECOV_IN_PROGRESS) != 0) {
- /* End of 1st recovery try? */
- if (hostPtr->numTries <= 1) {
- Timer_GetCurrentTicks(&hostPtr->finished);
- /* Final count of handles recovered is in hostPtr. */
- hostPtr->numHandles = hostPtr->currentHandles;
- hostPtr->numSuccessful = hostPtr->currentSuccessful;
- } else {
- if (List_IsEmpty(&hostPtr->timeStampList)) {
- printf("Recov_ClearClientState: timeStampList is empty!\n");
- hostPtr->numSuccessful = 0; /* signal the error */
- } else {
- stampPtr = (RecovStampList *)
- List_Last((List_Links *) &hostPtr->timeStampList);
- Timer_GetCurrentTicks(&stampPtr->finished);
- stampPtr->numHandles = hostPtr->currentHandles;
- stampPtr->numSuccessful = hostPtr->currentSuccessful;
- }
- }
- }
- UNLOCK_MONITOR;
- return;
- }
-
-
- /*
- *----------------------------------------------------------------------
- *
- * Recov_AddHandleCountToClientState --
- *
- * Increment count of handles reopened from this client.
- *
- * Results:
- * None.
- *
- * Side effects:
- * Data in per-host recovery info updated.
- *
- *----------------------------------------------------------------------
- */
- ENTRY void
- Recov_AddHandleCountToClientState(type, clientID, status)
- int type; /* Type of handle being reopened. */
- int clientID; /* Id of client requesting reopen. */
- ReturnStatus status; /* Whether the reopen succeeded. */
- {
- register Hash_Entry *hashPtr;
- register RecovHostState *hostPtr = (RecovHostState *) NIL;
-
- LOCK_MONITOR;
-
- hashPtr = Hash_LookOnly(recovHashTable, (Address)clientID);
- if (hashPtr != (Hash_Entry *)NIL) {
- hostPtr = (RecovHostState *)hashPtr->value;
- if (hostPtr != (RecovHostState *)NIL) {
- hostPtr->currentHandles++;
- if (status == SUCCESS) {
- hostPtr->currentSuccessful++;
- }
- }
- }
- UNLOCK_MONITOR;
- return;
- }
-
-
- /*
- *----------------------------------------------------------------------
- *
- * Recov_DumpClientRecovInfo --
- *
- * Dump out some of the recovery statistics in the per-host info.
- *
- * Results:
- * Returns FAILURE if recovery still in progress. Returns SUCCESS
- * otherwise.
- *
- * Side effects:
- * Info copied into buffer. Size of needed buffer also copied out.
- *
- *----------------------------------------------------------------------
- */
- ENTRY ReturnStatus
- Recov_DumpClientRecovInfo(length, resultPtr, lengthNeededPtr)
- int length; /* size of data buffer */
- Address resultPtr; /* Array of info structs. */
- int *lengthNeededPtr; /* to return space needed */
- {
- Hash_Entry *hashPtr;
- RecovHostState *hostPtr;
- Hash_Search hashSearch;
- RecovPerHostInfo *infoPtr;
- int numNeeded;
- int numAvail;
-
- LOCK_MONITOR;
-
- /*
- * If recovery still going on, return FAILURE.
- * NOTE: This isn't a sure-fire test. I'm not sure there is one right now.
- */
- if (fsutil_NumRecovering >= 1) {
- UNLOCK_MONITOR;
- return FAILURE;
- }
- if (resultPtr != (Address) NIL) {
- bzero(resultPtr, length);
- }
- numNeeded = 0;
- numAvail = length / sizeof (RecovPerHostInfo);
-
- infoPtr = (RecovPerHostInfo *) resultPtr;
- Hash_StartSearch(&hashSearch);
- for (hashPtr = Hash_Next(recovHashTable, &hashSearch);
- hashPtr != (Hash_Entry *) NIL;
- hashPtr = Hash_Next(recovHashTable, &hashSearch)) {
- hostPtr = (RecovHostState *)hashPtr->value;
-
- /*
- * We need one slot for each host, whether numTries is 0 or 1, plus
- * additional slots for each numTries over 1.
- */
- numNeeded++;
- if (hostPtr->numTries > 1) {
- numNeeded += (hostPtr->numTries - 1);
- }
- if (numNeeded > numAvail) {
- continue;
- }
- /* Why didn't Brent use GetValue()??? */
- if (hostPtr != (RecovHostState *) NIL) {
- RecovStampList *stampPtr;
-
- /* Copy info into buffer */
- infoPtr->spriteID = hostPtr->spriteID;
- infoPtr->numTries = hostPtr->numTries;
- Timer_GetRealTimeFromTicks(hostPtr->start,
- &(infoPtr->start), (int *) NIL, (Boolean *) NIL);
- Timer_GetRealTimeFromTicks(hostPtr->finished,
- &(infoPtr->finished), (int *)NIL, (Boolean *) NIL);
- infoPtr->numHandles = hostPtr->numHandles;
- infoPtr->numSuccessful = hostPtr->numSuccessful;
- LIST_FORALL(&hostPtr->timeStampList, (List_Links *) stampPtr) {
- infoPtr++;
- Timer_GetRealTimeFromTicks(stampPtr->start,
- &infoPtr->start, (int *) NIL, (Boolean *) NIL);
- Timer_GetRealTimeFromTicks(stampPtr->finished,
- &infoPtr->finished, (int *) NIL, (Boolean *) NIL);
- infoPtr->numHandles = stampPtr->numHandles;
- infoPtr->numSuccessful = stampPtr->numSuccessful;
- }
- }
- infoPtr++;
- }
- *lengthNeededPtr = numNeeded * sizeof (RecovPerHostInfo);
- UNLOCK_MONITOR;
-
- return SUCCESS;
- }
-
- /*
- *----------------------------------------------------------------------
- *
- * RecovRebootCallBacks --
- *
- * This calls the call-back procedures installed by other modules
- * via Recov_RebootRegister. It is invoked asynchronously from
- * Recov_HostAlive when that procedure detects a reboot.
- *
- * Results:
- * None.
- *
- * Side effects:
- * Invoke the call-backs.
- *
- *----------------------------------------------------------------------
- */
- /*ARGSUSED*/
- void
- RecovRebootCallBacks(data, callInfoPtr)
- ClientData data;
- Proc_CallInfo *callInfoPtr;
- {
- List_Links notifyList;
- register NotifyElement *notifyPtr;
- register int spriteID = (int)data;
-
- printf("RecovRebootCallBacks called for %d\n", spriteID);
- GetRebootList(¬ifyList, spriteID);
- RECOV_TRACE(spriteID, RECOV_STATE_UNKNOWN, RECOV_CUZ_DOING_CALLBACKS);
- recov_Stats.reboots++;
- while (!List_IsEmpty(¬ifyList)) {
- notifyPtr = (NotifyElement *)List_First(¬ifyList);
- (*notifyPtr->proc)(spriteID, notifyPtr->data);
- List_Remove((List_Links *)notifyPtr);
- free((Address)notifyPtr);
- }
- CallBacksDone(spriteID);
- return;
- }
-
- /*
- *----------------------------------------------------------------------
- *
- * GetRebootList --
- *
- * Copy out the list of reboot callbacks. The list is protected by
- * a monitor, but we don't want to call any recovery procedures from
- * inside that monitor so we make a copy.
- *
- * Results:
- * None.
- *
- * Side effects:
- * Copy the reboot list off the host state table and return it
- * to our caller who should free up the copied elements.
- *
- *----------------------------------------------------------------------
- */
-
- ENTRY static void
- GetRebootList(notifyListHdr, spriteID)
- List_Links *notifyListHdr;
- int spriteID;
- {
- register Hash_Entry *hashPtr;
- register RecovHostState *hostPtr;
- register NotifyElement *notifyPtr;
- register NotifyElement *newNotifyPtr;
-
- LOCK_MONITOR;
-
- hashPtr = Hash_LookOnly(recovHashTable, (Address)spriteID);
- hostPtr = (RecovHostState *)hashPtr->value;
- List_Init(notifyListHdr);
- LIST_FORALL(&hostPtr->rebootList, (List_Links *)notifyPtr) {
- newNotifyPtr = (NotifyElement *) malloc(sizeof (NotifyElement));
- newNotifyPtr->proc = notifyPtr->proc;
- newNotifyPtr->data = notifyPtr->data;
- List_InitElement((List_Links *)newNotifyPtr);
- List_Insert((List_Links *)newNotifyPtr, LIST_ATREAR(notifyListHdr));
- }
- UNLOCK_MONITOR;
- return;
- }
-
- /*
- *----------------------------------------------------------------------
- *
- * CallBacksDone --
- *
- * Clear the internal state bit that says callbacks are in progress.
- * This checks to see if there was a communication failure during
- * the reboot callbacks. If so, the WANT_RECOVERY bit is set
- * to ensure that another set of reboot callbacks are made.
- *
- * Results:
- * None.
- *
- * Side effects:
- * Clears RECOV_REBOOT_CALLBACKS and RECOV_FAILURE. May set
- * RECOV_WANT_RECOVERY if RECOV_FAILURE was set.
- *
- *----------------------------------------------------------------------
- */
-
- ENTRY static void
- CallBacksDone(spriteID)
- int spriteID;
- {
- register Hash_Entry *hashPtr;
- register RecovHostState *hostPtr;
-
- LOCK_MONITOR;
-
- hashPtr = Hash_LookOnly(recovHashTable, (Address)spriteID);
- hostPtr = (RecovHostState *)hashPtr->value;
- hostPtr->state &= ~RECOV_REBOOT_CALLBACKS;
- RECOV_TRACE(spriteID, hostPtr->state, RECOV_CUZ_DONE_CALLBACKS);
- if (hostPtr->state & (RECOV_FAILURE)) {
- /*
- * There has been a communication failure during the reboot callbacks.
- */
- hostPtr->numFailures++;
- hostPtr->state &= ~RECOV_FAILURE;
- hostPtr->state |= RECOV_WANT_RECOVERY;
- RECOV_TRACE(spriteID, hostPtr->state, RECOV_CUZ_FAILURE);
- } else {
- hostPtr->numFailures = 0;
- }
- UNLOCK_MONITOR;
- return;
- }
-
- /*
- *----------------------------------------------------------------------
- *
- * CrashCallBacks --
- *
- * Invoked asynchronously so that other modules
- * can clean up behind the crashed host. When done the host
- * is marked as having recovery complete. This unblocks server
- * processes stalled in Recov_HostAlive.
- *
- * Results:
- * None.
- *
- * Side effects:
- * Invoke the crash call-backs.
- * Clears the recovery in progress flag checked in Recov_HostAlive.
- *
- *----------------------------------------------------------------------
- */
-
- static void
- CrashCallBacks(data, callInfoPtr)
- ClientData data;
- Proc_CallInfo *callInfoPtr;
- {
- register NotifyElement *notifyPtr;
- register int spriteID = (int)data;
-
- recov_Stats.crashes++;
- LIST_FORALL(&crashCallBackList, (List_Links *)notifyPtr) {
- if (notifyPtr->proc != (void (*)())NIL) {
- (*notifyPtr->proc)(spriteID, notifyPtr->data);
- }
- }
- MarkRecoveryComplete(spriteID);
- RECOV_TRACE(spriteID, RECOV_CRASH, RECOV_CUZ_DONE);
- callInfoPtr->interval = 0; /* Don't call again */
- return;
- }
-
- /*
- *----------------------------------------------------------------------
- *
- * MarkRecoveryComplete --
- *
- * The recovery call-backs have completed, and this procedure's
- * job is to mark that fact in the host hash table and to notify
- * any processes that are blocked in Recov_HostAlive waiting for this.
- *
- * Results:
- * None.
- *
- * Side effects:
- * Sets the state, if any, in the host state table.
- * Notifies the hostPtr->recovery condition
- *
- *----------------------------------------------------------------------
- */
-
- ENTRY static void
- MarkRecoveryComplete(spriteID)
- int spriteID;
- {
- register Hash_Entry *hashPtr;
- register RecovHostState *hostPtr;
-
- LOCK_MONITOR;
-
- hashPtr = Hash_LookOnly(recovHashTable, (Address)spriteID);
- if (hashPtr != (Hash_Entry *)NIL) {
- hostPtr = (RecovHostState *)hashPtr->value;
- if (hostPtr != (RecovHostState *)NIL) {
- hostPtr->state &= ~RECOV_CRASH_CALLBACKS;
- Sync_Broadcast(&hostPtr->recovery);
- }
- }
- UNLOCK_MONITOR;
- return;
- }
-
- /*
- *----------------------------------------------------------------------
- *
- * Recov_GetHostState --
- *
- * This looks into the host table to see and provides a guess
- * as to the host's current state. It uses a timestamp kept in
- * the host state to see if there's been recent message traffic.
- * If so, RECOV_HOST_ALIVE is returned. If not, RECOV_STATE_UNKNOWN
- * is returned and the caller should ping to make sure. Finally,
- * if it is known that the host is down already, then RECOV_HOST_DEAD
- * is returned.
- *
- * Results:
- * RECOV_STATE_UNKNOWN if the caller should ping to make sure.
- * RECOV_HOST_ALIVE if the host is up (recent message traffic).
- * RECOV_HOST_DEAD if the host is down (recent timeouts).
- *
- * Side effects:
- * None.
- *
- *----------------------------------------------------------------------
- */
-
- ENTRY int
- Recov_GetHostState(spriteID)
- int spriteID;
- {
- register Hash_Entry *hashPtr;
- register RecovHostState *hostPtr;
- register int state = RECOV_STATE_UNKNOWN;
- Time time;
-
- LOCK_MONITOR;
-
- hashPtr = Hash_LookOnly(recovHashTable, (Address)spriteID);
- if (hashPtr != (Hash_Entry *)NIL) {
- hostPtr = (RecovHostState *)hashPtr->value;
- if (hostPtr != (RecovHostState *)NIL) {
- state = hostPtr->state &
- (RECOV_HOST_ALIVE|RECOV_HOST_BOOTING|RECOV_HOST_DEAD);
- if (state & (RECOV_HOST_ALIVE|RECOV_HOST_BOOTING)) {
- /*
- * Check for recent message traffic before admitting
- * that the other machine is up.
- */
- Timer_GetTimeOfDay(&time, (int *)NIL, (Boolean *)NIL);
- Time_Subtract(time, hostPtr->time, &time);
- if (Time_GT(time, time_TenSeconds)) {
- state = RECOV_STATE_UNKNOWN;
- }
- }
- }
- }
- UNLOCK_MONITOR;
- return(state);
- }
-
- /*
- *----------------------------------------------------------------------
- *
- * Recov_GetHostOldState --
- *
- * This looks into the host table to see what was the host's old
- * state the last time it was pinged.
- *
- * Results:
- * RECOV_STATE_UNKNOWN if unsure.
- * RECOV_HOST_ALIVE if the host was up (recent message traffic).
- * RECOV_HOST_DEAD if the host was down (recent timeouts).
- *
- * Side effects:
- * None.
- *
- *----------------------------------------------------------------------
- */
-
- ENTRY int
- Recov_GetHostOldState(spriteID)
- int spriteID;
- {
- register Hash_Entry *hashPtr;
- register RecovHostState *hostPtr;
- register int oldState = RECOV_STATE_UNKNOWN;
-
- LOCK_MONITOR;
-
- hashPtr = Hash_LookOnly(recovHashTable, (Address)spriteID);
- if (hashPtr != (Hash_Entry *)NIL) {
- hostPtr = (RecovHostState *)hashPtr->value;
- if (hostPtr != (RecovHostState *)NIL) {
- oldState = hostPtr->oldState &
- (RECOV_HOST_ALIVE|RECOV_HOST_BOOTING|RECOV_HOST_DEAD);
- }
- }
- UNLOCK_MONITOR;
- return(oldState);
- }
-
- /*
- *----------------------------------------------------------------------
- *
- * Recov_SetHostOldState --
- *
- * This sets the oldState of the host to what it was when pinged
- * this time, for reference the next time it's pinged.
- *
- * Results:
- * None.
- *
- * Side effects:
- * Sets oldState in host info.
- *
- *----------------------------------------------------------------------
- */
-
- ENTRY void
- Recov_SetHostOldState(spriteID, state)
- int spriteID;
- int state;
- {
- register Hash_Entry *hashPtr;
- register RecovHostState *hostPtr;
-
- LOCK_MONITOR;
-
- hashPtr = Hash_LookOnly(recovHashTable, (Address)spriteID);
- if (hashPtr != (Hash_Entry *)NIL) {
- hostPtr = (RecovHostState *)hashPtr->value;
- if (hostPtr != (RecovHostState *)NIL) {
- hostPtr->oldState = state;
- }
- }
- UNLOCK_MONITOR;
- return;
- }
-
- /*
- *----------------------------------------------------------------------
- *
- * RecovGetLastHostState --
- *
- * This looks into the host table to pass back the
- * host's current state. It just uses whatever state the
- * host has marked currently, and does no further interpretation.
- *
- * Results:
- * hostPtr->state
- *
- * Side effects:
- * None.
- *
- *----------------------------------------------------------------------
- */
-
- ENTRY int
- RecovGetLastHostState(spriteID)
- int spriteID;
- {
- register Hash_Entry *hashPtr;
- register RecovHostState *hostPtr;
- register int state = RECOV_STATE_UNKNOWN;
-
- LOCK_MONITOR;
-
- hashPtr = Hash_LookOnly(recovHashTable, (Address)spriteID);
- if (hashPtr != (Hash_Entry *)NIL) {
- hostPtr = (RecovHostState *)hashPtr->value;
- if (hostPtr != (RecovHostState *)NIL) {
- state = hostPtr->state;
- }
- }
- UNLOCK_MONITOR;
- return(state);
- }
-
- /*
- *----------------------------------------------------------------------
- *
- * RecovCheckHost --
- *
- * This decides if we should check up on a host. If there has
- * been recent message traffic there is no need to ping now,
- * but we should check again later. If there has been no
- * message traffic our caller should ping. Finally, if
- * there are no reboot callbacks associated with the host,
- * then we are not interested anymore. Thus there are three
- * values to return.
- *
- * Results:
- * -1 if we are no longer interested in the host.
- * 0 if the host is presumably up and we don't have to ping.
- * 1 if our caller should ping.
- *
- * Side effects:
- * None.
- *
- *----------------------------------------------------------------------
- */
-
- ENTRY int
- RecovCheckHost(spriteID)
- int spriteID;
- {
- register Hash_Entry *hashPtr;
- register RecovHostState *hostPtr = (RecovHostState *)NIL;
- register int check = -1; /* forget about the host */
- register int state;
-
- LOCK_MONITOR;
-
- hashPtr = Hash_LookOnly(recovHashTable, (Address)spriteID);
- if (hashPtr != (Hash_Entry *)NIL) {
- hostPtr = (RecovHostState *)hashPtr->value;
- if ((hostPtr != (RecovHostState *)NIL) &&
- (!List_IsEmpty(&hostPtr->rebootList))) {
- state = hostPtr->state &
- (RECOV_HOST_ALIVE|RECOV_HOST_BOOTING|RECOV_HOST_DEAD);
- if (state & (RECOV_HOST_ALIVE|RECOV_HOST_BOOTING)) {
- /*
- * Check for recent message traffic before admitting
- * that the other machine is up.
- */
- Time time;
- Timer_GetTimeOfDay(&time, (int *)NIL, (Boolean *)NIL);
- Time_Subtract(time, hostPtr->time, &time);
- if (Time_GT(time, time_TenSeconds)) {
- check = 1; /* ping the host now */
- } else {
- check = 0; /* ping the host maybe next time */
- }
- } else if (state & RECOV_HOST_DEAD) {
- check = 1; /* ping the host now */
- }
- }
- }
- if (check < 0 && hostPtr != (RecovHostState *)NIL) {
- hostPtr->state &= ~RECOV_PINGING_HOST;
- }
- UNLOCK_MONITOR;
- return(check);
- }
-
- /*
- *----------------------------------------------------------------------
- *
- * Recov_GetStats --
- *
- * Return the Recov_Stats to user-level, and perhaps more information
- * about our internal opinion of other hosts.
- *
- * Results:
- * None.
- *
- * Side effects:
- * Copies data out to user-space.
- *
- *----------------------------------------------------------------------
- */
- ReturnStatus
- Recov_GetStats(size, userAddr)
- int size;
- Address userAddr;
- {
- ReturnStatus status;
- #ifdef notdef
- int extraSpace = -1;
- #endif
- if (size <= 0) {
- return(GEN_INVALID_ARG);
- }
- /*
- * See if the caller wants more than just statistics.
- */
- if (size > sizeof(Recov_Stats)) {
- #ifdef notdef
- extraSpace = size - sizeof(Recov_Stats);
- #endif
- size = sizeof(Recov_Stats);
- }
- status = Vm_CopyOut(size, (Address)&recov_Stats, userAddr);
-
- #ifdef notdef
- if (extraSpace > sizeof(int)) {
- /*
- * Fill the user-space buffer with a count of hosts,
- * and then information about each host.
- */
- userAddr += sizeof(Recov_Stats);
- status = Recov_DumpState(extraSpace, userAddr);
- }
- #endif notdef
- return(status);
- }
-
- /*
- *----------------------------------------------------------------------
- *
- * Recov_DumpState --
- *
- * Dump internal state to user-level.
- *
- * Results:
- * None.
- *
- * Side effects:
- * Copies data out to user-space.
- *
- *----------------------------------------------------------------------
- */
- ReturnStatus
- Recov_DumpState(size, userAddr)
- int size;
- Address userAddr;
- {
- ReturnStatus status = SUCCESS;
- int numHosts, maxHosts;
- int *countPtr;
- int spriteID;
- Recov_State recovState;
-
- /*
- * We return a count, plus count number of Recov_State structures.
- */
- maxHosts = (size - sizeof(int)) / sizeof(Recov_State);
- countPtr = (int *)userAddr;
- if ((maxHosts == 0) && (size > sizeof(int))) {
- status = Vm_CopyOut(sizeof(int), (Address)&maxHosts, (Address)countPtr);
- return(status);
- }
- userAddr += sizeof(int);
- /*
- * Brute force. Run through til MAX_HOSTS and try to grab
- * the state from the hash table.
- */
- numHosts = 0;
- for (spriteID = 1 ; spriteID < NET_NUM_SPRITE_HOSTS ; spriteID++) {
- if (Recov_GetHostInfo(spriteID, &recovState)) {
- status = Vm_CopyOut(sizeof(recovState), (Address)&recovState,
- userAddr);
- if (status != SUCCESS) {
- return(status);
- }
- userAddr += sizeof(recovState);
- numHosts++;
- if (numHosts >= maxHosts) {
- break;
- }
- }
- }
-
- return(status);
- }
-
- /*
- *----------------------------------------------------------------------
- *
- * Recov_GetHostInfo --
- *
- * Get the internal state about a host.
- *
- * Results:
- * Fills in a Recov_State structure and returns TRUE,
- * otherwise, if we don't know about the host, returns FALSE
- *
- * Side effects:
- * None.
- *
- *----------------------------------------------------------------------
- */
-
- ENTRY Boolean
- Recov_GetHostInfo(spriteID, recovStatePtr)
- int spriteID;
- Recov_State *recovStatePtr;
- {
- Hash_Entry *hashPtr;
- RecovHostState *hostPtr;
- Boolean found = FALSE;
-
- LOCK_MONITOR;
-
- if (spriteID <= 0 || spriteID == rpc_SpriteID) {
- goto exit;
- } else {
- hashPtr = Hash_LookOnly(recovHashTable, (Address)spriteID);
- if (hashPtr == (Hash_Entry *)NULL || hashPtr->value == (Address)NIL) {
- goto exit;
- } else {
- hostPtr = (RecovHostState *)hashPtr->value;
- }
- recovStatePtr->spriteID = spriteID;
- recovStatePtr->state = hostPtr->state;
- recovStatePtr->clientState = hostPtr->clientState;
- recovStatePtr->bootID = hostPtr->bootID;
- recovStatePtr->time = hostPtr->time;
- found = TRUE;
- }
- exit:
- UNLOCK_MONITOR;
- return(found);
- }
-
-
- /*
- *----------------------------------------------------------------------
- *
- * Recov_PrintTraceRecord --
- *
- * Format and print the client data part of a recovery trace record.
- *
- * Results:
- * None.
- *
- * Side effects:
- * printf to the display.
- *
- *----------------------------------------------------------------------
- */
- int
- Recov_PrintTraceRecord(clientData, event, printHeaderFlag)
- ClientData clientData; /* Client data in the trace record */
- int event; /* Type, or event, from the trace record */
- Boolean printHeaderFlag; /* If TRUE, a header line is printed */
- {
- RecovTraceRecord *recPtr = (RecovTraceRecord *)clientData;
- char name[128];
- if (printHeaderFlag) {
- /*
- * Print column headers and a newline.
- */
- printf("%10s %10s %17s\n", "Host", "State", "Event ");
- }
- if (clientData != (ClientData)NIL) {
- Net_SpriteIDToName(recPtr->spriteID, 128, name);
- if (*name == '\0') {
- printf("%10d ", recPtr->spriteID);
- } else {
- printf("%10s ", name);
- }
- printf("%-8s", GetState(recPtr->state));
- printf("%3s", (recPtr->state & RECOV_CRASH_CALLBACKS) ?
- " C " : " ");
- printf("%3s", (recPtr->state & RECOV_PINGING_HOST) ?
- " P " : " ");
- printf("%3s", (recPtr->state & RECOV_REBOOT_CALLBACKS) ?
- " R " : " ");
- printf("%3s", (recPtr->state & RECOV_WANT_RECOVERY) ?
- " W " : " ");
- printf("%3s", (recPtr->state & RECOV_FAILURE) ?
- " F " : " ");
- switch(event) {
- case RECOV_CUZ_WAIT:
- printf("waiting");
- break;
- case RECOV_CUZ_WAKEUP:
- printf("wakeup");
- break;
- case RECOV_CUZ_INIT:
- printf("init");
- break;
- case RECOV_CUZ_REBOOT:
- printf("reboot");
- break;
- case RECOV_CUZ_CRASH:
- printf("crash");
- break;
- case RECOV_CUZ_CRASH_UNDETECTED:
- printf("crash undetected");
- break;
- case RECOV_CUZ_DONE:
- printf("done");
- break;
- case RECOV_CUZ_PING_ASK:
- printf("ping (ask)");
- break;
- case RECOV_CUZ_PING_CHK:
- printf("ping (check)");
- break;
- case RECOV_TRACE_FS_STALE:
- printf("stale FS handle");
- break;
- case RECOV_CUZ_SCHED_CALLBACK:
- printf("scheduled callback");
- break;
- case RECOV_CUZ_DONE_CALLBACKS:
- printf("done callbacks");
- break;
- case RECOV_CUZ_FAILURE:
- printf("failure during callbacks");
- break;
- case RECOV_CUZ_WAS_BOOTING:
- printf("was booting");
- break;
- case RECOV_CUZ_NOW_BOOTING:
- printf("now booting");
- break;
- case RECOV_CUZ_WAS_DEAD:
- printf("was dead");
- break;
- case RECOV_CUZ_DOING_CALLBACKS:
- printf("doing callbacks");
- break;
- case RECOV_CUZ_START:
- printf("start-up");
- break;
- default:
- printf("(%x)", event);
- break;
- }
- /* Our caller prints a newline */
- }
- return 0;
- }
-
- /*
- *----------------------------------------------------------------------
- *
- * Recov_PrintTrace --
- *
- * Dump out the recovery trace. Called via a console L1 keystroke.
- *
- * Results:
- * None.
- *
- * Side effects:
- * Prints to the console.
- *
- *----------------------------------------------------------------------
- */
-
- void
- Recov_PrintTrace(clientData)
- ClientData clientData;
- {
- int numRecs = (int)clientData;
-
- if (numRecs <= 0 || numRecs > recovTraceLength) {
- numRecs = recovTraceLength;
- }
- printf("RECOVERY TRACE\n");
- (void)Trace_Print(recovTraceHdrPtr, numRecs, Recov_PrintTraceRecord);
- Recov_PrintState();
- RecovPrintPingList();
- return;
- }
-
- /*
- *----------------------------------------------------------------------
- *
- * Recov_PrintState --
- *
- * Dump out the recovery state. Called via a console L1 keystroke.
- *
- * Results:
- * None.
- *
- * Side effects:
- * Prints to the console.
- *
- *----------------------------------------------------------------------
- */
-
- void
- Recov_PrintState()
- {
- Hash_Search hashSearch;
- register Hash_Entry *hashEntryPtr;
- register RecovHostState *hostPtr;
- char hostName[128];
- Time_Parts timeParts;
- Time bootTime;
- int localOffset; /* minute offset for our tz */
- Time currentTime;
-
- printf("RECOVERY STATE\n");
- Hash_StartSearch(&hashSearch);
- for (hashEntryPtr = Hash_Next(recovHashTable, &hashSearch);
- hashEntryPtr != (Hash_Entry *)NIL;
- hashEntryPtr = Hash_Next(recovHashTable, &hashSearch)) {
- hostPtr = (RecovHostState *)hashEntryPtr->value;
- if (hostPtr != (RecovHostState *)NIL) {
-
- Net_SpriteIDToName(hostPtr->spriteID, 128, hostName);
- printf("%-14s %-8s", hostName, GetState(hostPtr->state));
- printf(" bootID 0x%8x", hostPtr->bootID);
-
- /*
- * Print out boot time in our timezone.
- */
- Timer_GetTimeOfDay(¤tTime, &localOffset, (Boolean *) NIL);
- bootTime.seconds = hostPtr->bootID;
- bootTime.microseconds = 0;
- bootTime.seconds += (localOffset * 60);
- Time_ToParts(bootTime.seconds, FALSE, &timeParts);
- timeParts.month++; /* So Jan is 1, not 0 */
- printf(" %d/%d/%d %d:%02d:%02d ", timeParts.month,
- timeParts.dayOfMonth,
- timeParts.year, timeParts.hours, timeParts.minutes,
- timeParts.seconds);
-
- /*
- * Print seconds ago we last heard from host.
- */
- printf(" %d ", currentTime.seconds - hostPtr->time.seconds);
- PrintExtraState(hostPtr);
- printf("\n");
- }
- }
- return;
- }
-
- /*
- *----------------------------------------------------------------------
- *
- * GetState --
- *
- * Return a printable string for the host's state.
- *
- * Results:
- * A pointer to a string.
- *
- * Side effects:
- * None.
- *
- *----------------------------------------------------------------------
- */
-
- static char *
- GetState(state)
- int state;
- {
- switch(state & (RECOV_HOST_ALIVE|RECOV_HOST_DEAD|RECOV_HOST_BOOTING)) {
- default:
- case RECOV_STATE_UNKNOWN:
- return("Unknown");
- case RECOV_HOST_ALIVE:
- return("Alive");
- case RECOV_HOST_BOOTING:
- return("Booting");
- case RECOV_HOST_DEAD:
- return("Dead");
- }
- }
-
- /*
- *----------------------------------------------------------------------
- *
- * RecovExtraState --
- *
- * Prints out strings for various auxilliary state bits.
- *
- * Results:
- * None.
- *
- * Side effects:
- * Prints out stuff.
- *
- *----------------------------------------------------------------------
- */
- static void
- PrintExtraState(hostPtr)
- RecovHostState *hostPtr;
- {
- if (hostPtr->state & RECOV_CRASH_CALLBACKS) {
- printf("Crash callbacks ");
- }
- if (hostPtr->state & RECOV_WANT_RECOVERY) {
- printf("Want recovery ");
- }
- if (hostPtr->state & RECOV_REBOOT_CALLBACKS) {
- printf("Reboot callbacks ");
- }
- if (hostPtr->state & RECOV_FAILURE) {
- printf("Failure ");
- }
- if (hostPtr->clientState & CLT_RECOV_IN_PROGRESS) {
- printf("Clt-inprogress ");
- }
- if (hostPtr->clientState & SRV_RECOV_IN_PROGRESS) {
- printf("Srv-inprogress ");
- }
- }
-
-
- void
- Recov_ChangePrintLevel(newLevel)
- int newLevel;
- {
- recov_PrintLevel = newLevel;
- return;
- }
-
-
- /*
- *----------------------------------------------------------------------
- *
- * Recov_InitServerDriven --
- *
- * Initialize system to use server-driven recovery.
- *
- * Results:
- * None.
- *
- * Side effects:
- * Turns on this kind of server recovery.
- *
- *----------------------------------------------------------------------
- */
- void
- Recov_InitServerDriven()
- {
- recov_ServerDriven = TRUE;
- printf("Recov_InitServerDriven called.\n");
- if (recov_ServerDriven && recov_Transparent) {
- printf("Recov_InitServerDriven: can't do transparent recovery too.\n");
- printf("\tTurning it off!\n");
- recov_Transparent = FALSE;
- }
-
- return;
- }
- /*
- *----------------------------------------------------------------------
- *
- * Recov_StopServerDriven --
- *
- * Stop system from using server-driven recovery.
- *
- * Results:
- * None.
- *
- * Side effects:
- * Turns off this kind of server recovery.
- *
- *----------------------------------------------------------------------
- */
- void
- Recov_StopServerDriven()
- {
- recov_ServerDriven = FALSE;
- printf("Recov_StopServerDriven called.\n");
-
- return;
- }
-
- /*
- *----------------------------------------------------------------------
- *
- * Recov_StartServerDrivenRecovery --
- *
- * Kick off server-driven recovery on client.
- *
- * Results:
- * None.
- *
- * Side effects:
- * Starts recovery.
- *
- *----------------------------------------------------------------------
- */
- void
- Recov_StartServerDrivenRecovery(serverID)
- int serverID; /* ID of server requesting recovery. */
- {
- Hash_Entry *hashPtr;
- RecovHostState *hostPtr;
-
- LOCK_MONITOR;
- /* Set flag saying server-driven recovery is in progress. */
- hashPtr = Hash_Find(recovHashTable, (Address) serverID);
- if (hashPtr->value == (Address) NIL) {
- /*
- * Client may have been rebooted during server crash, so it may
- * not know about server when server contacts it for recovery.
- * That's why this isn't a panic.
- */
- printf("Recov_StartServerDrivenRecovery: don't know about server %d\n",
- serverID);
- printf("\tBut probably I should since it's trying to recover with me!");
- RECOV_INIT_HOST(hostPtr, serverID, RECOV_HOST_ALIVE, 0);
- hashPtr->value = (Address) hostPtr;
- RECOV_TRACE(serverID, RECOV_HOST_ALIVE, RECOV_CUZ_INIT);
- }
- hostPtr = (RecovHostState *)hashPtr->value;
- if (hostPtr->state & SRV_DRIVEN_IN_PROGRESS) {
- UNLOCK_MONITOR;
- panic("Server called us for server-driven recovery more than once.\n");
- }
- hostPtr->state |= SRV_DRIVEN_IN_PROGRESS;
- /* XX Test if client already blocked. If not, panic. XX */
- Sync_Broadcast(&hostPtr->waitForServer);
- UNLOCK_MONITOR;
-
- return;
- }
-
- /*
- *----------------------------------------------------------------------
- *
- * Recov_ServerStartingRecovery --
- *
- * Mark that server-driven recovery has started, so that we
- * can block out various rpc's.
- *
- * Results:
- * None.
- *
- * Side effects:
- * Blocks some rpcs.
- *
- *----------------------------------------------------------------------
- */
- void
- Recov_ServerStartingRecovery()
- {
- LOCK_MONITOR;
- if (!recov_ServerDriven) {
- UNLOCK_MONITOR;
- panic("Recov_ServerStartingRecovery: server-driven not true!");
- }
- recov_BlockingRpcs = TRUE;
- UNLOCK_MONITOR;
-
- return;
- }
-
- /*
- *----------------------------------------------------------------------
- *
- * Recov_ServerFinishedRecovery --
- *
- * Mark that server-driven recovery is finished, so that we
- * can unblock various rpc's.
- *
- * Results:
- * None.
- *
- * Side effects:
- * Unblocks some rpcs.
- *
- *----------------------------------------------------------------------
- */
- void
- Recov_ServerFinishedRecovery()
- {
- LOCK_MONITOR;
- if (!recov_ServerDriven) {
- UNLOCK_MONITOR;
- panic("Recov_ServerFinishedRecovery: server-driven not true!");
- }
- recov_BlockingRpcs = FALSE;
- /*
- * Before turning off blocking, we should check if all up clients have
- * recovered. The "old" clients may not have. If their state
- * is marked CLT_RECOV_IN_PROGRESS, or if their finished is not after
- * their start, then we're waiting for them to go through regular
- * recovery with us. We should have a condition that client recovery
- * wakes up to check if it can turn off blocking. But since we don't want
- * to wait forever, we'd have to have a timeout also. But maybe the
- * easiest thing is to reboot everybody and not worry about backwards
- * compatibility??
- */
-
- UNLOCK_MONITOR;
-
- return;
- }
-
- /*
- *----------------------------------------------------------------------
- *
- * Recov_HoldForRecovery --
- *
- * Are we blocking rpc's and is this an rpc that should be blocked out?
- * This is called from Rpc_Dispatch, which means it's called at
- * interrupt level, which means we can't grab a monitor lock, but
- * that this should be okay.
- *
- * Results:
- * True or false.
- *
- * Side effects:
- * None.
- *
- *----------------------------------------------------------------------
- */
- Boolean
- Recov_HoldForRecovery(clientID, command)
- int clientID;
- int command;
- {
- Hash_Entry *hashPtr;
- RecovHostState *hostPtr;
-
- if (!recov_BlockingRpcs) {
- return FALSE;
- }
- switch (command) {
- case RPC_FS_PREFIX:
- case RPC_FS_OPEN:
- case RPC_FS_READ:
- case RPC_FS_WRITE:
- case RPC_FS_CLOSE:
- case RPC_FS_UNLINK:
- case RPC_FS_RENAME:
- case RPC_FS_MKDIR:
- case RPC_FS_RMDIR:
- case RPC_FS_MKDEV:
- case RPC_FS_LINK:
- case RPC_FS_SYM_LINK:
- case RPC_FS_GET_ATTR:
- case RPC_FS_SET_ATTR:
- case RPC_FS_GET_ATTR_PATH:
- case RPC_FS_SET_ATTR_PATH:
- case RPC_FS_GET_IO_ATTR:
- case RPC_FS_SET_IO_ATTR:
- case RPC_FS_DEV_OPEN:
- case RPC_FS_SELECT:
- case RPC_FS_IO_CONTROL:
- /* Leave consistency rpc's unblocked for recovery. */
- case RPC_FS_MIGRATE:
- case RPC_FS_RELEASE:
- case RPC_PROC_MIG_COMMAND:
- case RPC_PROC_REMOTE_CALL:
- case RPC_PROC_REMOTE_WAIT:
- case RPC_FS_RELEASE_NEW:
- return TRUE;
- break;
- default:
- break;
- }
- /*
- * If host is not marked doing recovery right now, then ignore it.
- * We do this to avoid race conditions and deadlocks with some new
- * hosts trying to talk to the server while we're recovering old hosts.
- */
- if (!Mach_AtInterruptLevel()) {
- LOCK_MONITOR;
- }
- hashPtr = Hash_Find(recovHashTable, (Address) clientID);
- if (hashPtr->value == (Address) NIL) {
- RECOV_INIT_HOST(hostPtr, clientID, RECOV_STATE_UNKNOWN, 0);
- hashPtr->value = (Address) hostPtr;
- RECOV_TRACE(clientID, RECOV_STATE_UNKNOWN, RECOV_CUZ_INIT);
- /* No, don't ignore it?? */
- if (!Mach_AtInterruptLevel()) {
- UNLOCK_MONITOR;
- }
- return TRUE;
- }
- hostPtr = (RecovHostState *)hashPtr->value;
- if ((hostPtr->clientState & CLT_DOING_SRV_RECOV) == 0) {
- if (!Mach_AtInterruptLevel()) {
- UNLOCK_MONITOR;
- }
- return TRUE;
- }
-
- if (!Mach_AtInterruptLevel()) {
- UNLOCK_MONITOR;
- }
- return FALSE;
- }
-
- /*
- *----------------------------------------------------------------------
- *
- * Recov_MarkOldClient --
- *
- * Mark this client as one running an old kernel, and thus unable
- * to allow server-drivven recovery.
- *
- * Results:
- * None.
- *
- * Side effects:
- * Marks state of client.
- *
- *----------------------------------------------------------------------
- */
- void
- Recov_MarkOldClient(clientID)
- int clientID;
- {
- Hash_Entry *hashPtr;
- RecovHostState *hostPtr;
-
- LOCK_MONITOR;
- hashPtr = Hash_Find(recovHashTable, (Address) clientID);
- if (hashPtr->value == (Address) NIL) {
- RECOV_INIT_HOST(hostPtr, clientID, RECOV_STATE_UNKNOWN, 0);
- hashPtr->value = (Address) hostPtr;
- RECOV_TRACE(clientID, RECOV_STATE_UNKNOWN, RECOV_CUZ_INIT);
- }
- hostPtr = (RecovHostState *)hashPtr->value;
- hostPtr->clientState |= CLT_OLD_RECOV;
-
- UNLOCK_MONITOR;
-
- return;
- }
-
-
-
- /*
- *----------------------------------------------------------------------
- *
- * Recov_WaitForServerDriven --
- *
- * Make client wait for rpc from server telling it to start recovery.
- * Also set a timeout in case server gets hosed and forgets us.
- *
- * Results:
- * None.
- *
- * Side effects:
- * Makes us wait for recovery.
- *
- *----------------------------------------------------------------------
- */
- void
- Recov_WaitForServerDriven(serverID)
- int serverID;
- {
- Hash_Entry *hashPtr;
- RecovHostState *hostPtr;
-
- LOCK_MONITOR;
- /*
- * Set something saying client is waiting for server, so that server
- * rpc can check this to make sure client is already waiting when
- * it wakes it? For now, the client checks to see if server thing
- * already tried to wake it. XXX This isn't good enough, though, if
- * server recovery finishes and then client recovery tries again and sees
- * nothing going.
- */
- hashPtr = Hash_Find(recovHashTable, (Address) serverID);
- if (hashPtr->value == (Address) NIL) {
- panic("Recov_WaitForServer: should already know about this server.");
- }
- hostPtr = (RecovHostState *) hashPtr->value;
- /* Also set timeout so that if server forgets us, we'll be okay. XX */
- while ((hostPtr->state & SRV_DRIVEN_IN_PROGRESS) == 0) {
- (void) Sync_Wait(&hostPtr->waitForServer, FALSE);
- }
-
- UNLOCK_MONITOR;
-
- return;
- }
-
- /*
- *----------------------------------------------------------------------
- *
- * Recov_GetCurrentHostStates --
- *
- * Get the current state of hosts we know about as a server.
- *
- * Results:
- * Number of hosts in list. A negative number if there wasn't enough
- * buffer space.
- *
- * Side effects:
- * Fills in buffer with hosts and their state.
- *
- *----------------------------------------------------------------------
- */
- int
- Recov_GetCurrentHostStates(infoBuffer, bufEntries)
- Dev_ClientInfo *infoBuffer; /* The array of info entries to fill. */
- int bufEntries; /* Number of available entries. */
- {
- Hash_Entry *hashPtr;
- RecovHostState *hostPtr;
- Hash_Search hashSearch;
- int i;
-
- i = 0;
- Hash_StartSearch(&hashSearch);
- for (hashPtr = Hash_Next(recovHashTable, &hashSearch);
- hashPtr != (Hash_Entry *) NIL;
- hashPtr = Hash_Next(recovHashTable, &hashSearch)) {
- hostPtr = (RecovHostState *)hashPtr->value;
- if (hostPtr->state & RECOV_HOST_DEAD) {
- continue;
- }
- infoBuffer[i].hostNumber = hostPtr->spriteID;
- infoBuffer[i].hostState = DEV_CLIENT_STATE_NEW_HOST;
- i++;
- if (i >= bufEntries) {
- return -1;
- }
- }
-
- return i;
- }
-
- /*
- *----------------------------------------------------------------------
- *
- * Recov_MarkDoingServerRecovery --
- *
- * Mark client state as doing server recovery.
- *
- * Results:
- * None.
- *
- * Side effects:
- * Marked.
- *
- *----------------------------------------------------------------------
- */
- void
- Recov_MarkDoingServerRecovery(clientID)
- int clientID; /* The client. */
- {
- Hash_Entry *hashPtr;
- RecovHostState *hostPtr;
-
- LOCK_MONITOR;
- hashPtr = Hash_Find(recovHashTable, (Address) clientID);
- if (hashPtr->value == (Address) NIL) {
- RECOV_INIT_HOST(hostPtr, clientID, RECOV_STATE_UNKNOWN, 0);
- hashPtr->value = (Address) hostPtr;
- RECOV_TRACE(clientID, RECOV_STATE_UNKNOWN, RECOV_CUZ_INIT);
- }
- hostPtr = (RecovHostState *)hashPtr->value;
- hostPtr->clientState |= CLT_DOING_SRV_RECOV;
-
- UNLOCK_MONITOR;
-
- return;
- }
-
- /*
- *----------------------------------------------------------------------
- *
- * Recov_UnmarkDoingServerRecovery --
- *
- * Mark client state as not doing server recovery.
- *
- * Results:
- * None.
- *
- * Side effects:
- * Marked.
- *
- *----------------------------------------------------------------------
- */
- void
- Recov_UnmarkDoingServerRecovery(clientID)
- int clientID; /* The client. */
- {
- Hash_Entry *hashPtr;
- RecovHostState *hostPtr;
-
- LOCK_MONITOR;
- hashPtr = Hash_Find(recovHashTable, (Address) clientID);
- if (hashPtr->value == (Address) NIL) {
- printf("Recov_UnmarkDoingServerRecovery: don't know about client %d\n",
- clientID);
- RECOV_INIT_HOST(hostPtr, clientID, RECOV_STATE_UNKNOWN, 0);
- hashPtr->value = (Address) hostPtr;
- RECOV_TRACE(clientID, RECOV_STATE_UNKNOWN, RECOV_CUZ_INIT);
- }
- hostPtr = (RecovHostState *)hashPtr->value;
- hostPtr->clientState &= ~(CLT_DOING_SRV_RECOV);
-
- UNLOCK_MONITOR;
-
- return;
- }
-